! (C) Copyright 2000- ECMWF.
! (C) Copyright 2000- Meteo-France.
! (C) Copyright 2022- NVIDIA.
! 
! This software is licensed under the terms of the Apache Licence Version 2.0
! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
! In applying this licence, ECMWF does not waive the privileges and immunities
! granted to it by virtue of its status as an intergovernmental organisation
! nor does it submit to any jurisdiction.
!

MODULE TRMTOL_PACK_UNPACK
  USE BUFFERED_ALLOCATOR_MOD, ONLY: ALLOCATION_RESERVATION_HANDLE
  IMPLICIT NONE

  PRIVATE
  PUBLIC :: TRMTOL_PACK, TRMTOL_PACK_HANDLE, PREPARE_TRMTOL_PACK
  PUBLIC :: TRMTOL_UNPACK, TRMTOL_UNPACK_HANDLE, PREPARE_TRMTOL_UNPACK

  TYPE TRMTOL_PACK_HANDLE
    TYPE(ALLOCATION_RESERVATION_HANDLE) :: HFOUBUF_IN
  END TYPE
  TYPE TRMTOL_UNPACK_HANDLE
    TYPE(ALLOCATION_RESERVATION_HANDLE) :: HREEL
  END TYPE

CONTAINS
  FUNCTION PREPARE_TRMTOL_PACK(ALLOCATOR,KF_LEG) RESULT(HTRMTOL_PACK)
    USE PARKIND_ECTRANS,        ONLY: JPIM, JPRBT
    USE TPM_DISTR,              ONLY: D
    USE ISO_C_BINDING,          ONLY: C_SIZE_T
    USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, RESERVE

    IMPLICIT NONE

    TYPE(BUFFERED_ALLOCATOR), INTENT(INOUT) :: ALLOCATOR
    INTEGER(KIND=JPIM), INTENT(IN) :: KF_LEG

    TYPE(TRMTOL_PACK_HANDLE) :: HTRMTOL_PACK

    INTEGER(KIND=C_SIZE_T) :: IALLOC_SZ

    REAL(KIND=JPRBT) :: ZPRBT_DUMMY

    IALLOC_SZ = D%NLENGT1B*2*KF_LEG*SIZEOF(ZPRBT_DUMMY)
    HTRMTOL_PACK%HFOUBUF_IN = RESERVE(ALLOCATOR, int(IALLOC_SZ,kind=c_size_t))
  END FUNCTION
  SUBROUTINE TRMTOL_PACK(ALLOCATOR,HTRMTOL_PACK,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,FOUBUF_IN,KF_LEG)

    !**** *TRMTOL_PACK* - Packing buffer for TRMTOL

    !     Purpose.
    !     --------
    !        Packs data from LTINV outputs into FOUBUF for conversion to fourier space

    !**   Interface.
    !     ----------
    !        CALL TRMTOL_PACK(...)

    !        Explicit arguments :  ZOUTS - symmetric data
    !        --------------------  ZOUTA - asymmetric data
    !                              ZOUTS0 - symmetric data for KMLOC0
    !                              ZOUTA0 - asymmetric data for KMLOC0
    !                              FOUBUF_IN - output towards TRMTOL
    !                              KF_LEG - number of fields (we have 2XKF_LEG because complex)

    !        Implicit arguments :  None.
    !        --------------------

    !     Externals.
    !     ----------

    !     Reference.
    !     ----------
    !        ECMWF Research Department documentation of the IFS

    !     Author.
    !     -------
    !      Nils Wedi + Mats Hamrud + George Modzynski
    !
    !     Modifications.
    !     --------------
    !        J.Hague : Oct 2012 DR_HOOK round calls to DGEMM:
    !      F. Vana  05-Mar-2015  Support for single precision
    !     ------------------------------------------------------------------

    USE PARKIND_ECTRANS,        ONLY: JPIM, JPRB, JPRBT, JPRD
    USE YOMHOOK,                ONLY: LHOOK, DR_HOOK, JPHOOK
    USE TPM_DIM,                ONLY: R_NDGNH, R_NDGL
    USE TPM_GEOMETRY,           ONLY: G_NDGLU
    USE TPM_DISTR,              ONLY: D, D_NUMP, D_MYMS, D_NPNTGTB1, D_OFFSETS_GEMM1
    USE LEINV_MOD,              ONLY: LEINV_STRIDES
    USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, ASSIGN_PTR, GET_ALLOCATION
    USE ISO_C_BINDING,          ONLY: C_SIZE_T

    IMPLICIT NONE


    !     DUMMY ARGUMENTS
    TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR
    TYPE(TRMTOL_PACK_HANDLE), INTENT(IN) :: HTRMTOL_PACK
    REAL(KIND=JPRB), INTENT(OUT), POINTER :: FOUBUF_IN(:)
    REAL(KIND=JPRBT), INTENT(IN) :: ZOUTS(:), ZOUTA(:)
    REAL(KIND=JPRD), INTENT(IN) :: ZOUTS0(:), ZOUTA0(:)
    INTEGER(KIND=JPIM), INTENT(IN)  :: KF_LEG

    !     LOCAL
    REAL(KIND=JPRBT) :: ZAOA, ZSOA

    INTEGER(KIND=JPIM) :: KMLOC, KM, ISL, JGL, JK, IGLS, OFFSET1, OFFSET2
    INTEGER(KIND=JPIM)  :: IOUT_STRIDES0, IOUT_SIZE
    INTEGER(KIND=JPIM)  :: IOUT0_STRIDES0, IOUT0_SIZE

    REAL(KIND=JPHOOK) :: ZHOOK_HANDLE

    IF (LHOOK) CALL DR_HOOK('TRMTOL_PACK',0,ZHOOK_HANDLE)

    CALL ASSIGN_PTR(FOUBUF_IN, GET_ALLOCATION(ALLOCATOR, HTRMTOL_PACK%HFOUBUF_IN),&
      & 1_C_SIZE_T, int(D%NLENGT1B*2*KF_LEG*SIZEOF(FOUBUF_IN(1)),kind=c_size_t))

    CALL LEINV_STRIDES(KF_LEG,IOUT_STRIDES0=IOUT_STRIDES0,IOUT_SIZE=IOUT_SIZE,&
                       IOUT0_STRIDES0=IOUT0_STRIDES0,IOUT0_SIZE=IOUT0_SIZE)

#ifdef OMPGPU
#endif
#ifdef ACCGPU
    !$ACC DATA PRESENT(D_MYMS,D_NPNTGTB1,D_NUMP,G_NDGLU,R_NDGNH,R_NDGL) &
    !$ACC&     PRESENT(ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,FOUBUF_IN,D_OFFSETS_GEMM1)
#endif

#ifdef OMPGPU
#endif
#ifdef ACCGPU
    !$ACC PARALLEL LOOP COLLAPSE(3) DEFAULT(NONE) PRIVATE(KM,ISL,IGLS,OFFSET1,OFFSET2,ZAOA,ZSOA) &
    !$ACC&              FIRSTPRIVATE(KF_LEG,IOUT_STRIDES0,IOUT0_STRIDES0) ASYNC(1)
#endif
    DO KMLOC=1,D_NUMP
      DO JGL=1,R_NDGNH
        DO JK=1,2*KF_LEG
          KM = D_MYMS(KMLOC)
          ISL = R_NDGNH-G_NDGLU(KM)+1
          IF (JGL >= ISL) THEN
            !(DO JGL=ISL,R_NDGNH)
            IGLS = R_NDGL+1-JGL
            OFFSET1 = D_NPNTGTB1(KMLOC,JGL )*2*KF_LEG
            OFFSET2 = D_NPNTGTB1(KMLOC,IGLS)*2*KF_LEG

            IF(KM /= 0) THEN
              ZSOA = ZOUTS(JK+(JGL-ISL)*IOUT_STRIDES0+D_OFFSETS_GEMM1(KMLOC)*IOUT_STRIDES0)
              ZAOA = ZOUTA(JK+(JGL-ISL)*IOUT_STRIDES0+D_OFFSETS_GEMM1(KMLOC)*IOUT_STRIDES0)
            ELSEIF (MOD((JK-1),2) .EQ. 0) THEN
              ZSOA = ZOUTS0((JK-1)/2+1+(JGL-1)*IOUT0_STRIDES0)
              ZAOA = ZOUTA0((JK-1)/2+1+(JGL-1)*IOUT0_STRIDES0)
            ELSE
              ! Imaginary values of KM=0 is zero, though I don't think we care
              ZSOA = 0_JPRBT
              ZAOA = 0_JPRBT
            ENDIF

            FOUBUF_IN(OFFSET1+JK) = ZAOA+ZSOA
            FOUBUF_IN(OFFSET2+JK) = ZSOA-ZAOA
          ENDIF
        ENDDO
      ENDDO
    ENDDO

#ifdef OMPGPU
#endif
#ifdef ACCGPU
    !$ACC WAIT(1)

    !$ACC END DATA
#endif

    IF (LHOOK) CALL DR_HOOK('TRMTOL_PACK',1,ZHOOK_HANDLE)

  END SUBROUTINE TRMTOL_PACK

  FUNCTION PREPARE_TRMTOL_UNPACK(ALLOCATOR,KF_FS) RESULT(HTRMTOL_UNPACK)
    USE PARKIND_ECTRANS,        ONLY: JPIM, JPRBT
    USE TPM_DISTR,              ONLY: D
    USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, RESERVE
    USE ISO_C_BINDING,          ONLY: C_SIZE_T

    IMPLICIT NONE

    TYPE(BUFFERED_ALLOCATOR), INTENT(INOUT) :: ALLOCATOR
    INTEGER(KIND=JPIM) :: KF_FS

    TYPE(TRMTOL_UNPACK_HANDLE) :: HTRMTOL_UNPACK

    REAL(KIND=JPRBT) :: DUMMY

    HTRMTOL_UNPACK%HREEL = RESERVE(ALLOCATOR, int(D%NLENGTF*KF_FS*SIZEOF(DUMMY),kind=c_size_t))

  END FUNCTION PREPARE_TRMTOL_UNPACK
SUBROUTINE TRMTOL_UNPACK(ALLOCATOR,HTRMTOL_UNPACK,FOUBUF,PREEL_COMPLEX,KF_CURRENT,KF_TOTAL)

!**** *TRMTOL_UNPACK* - Copy fourier data from buffer to local array

!     Purpose.
!     --------
!        Routine for copying fourier data from buffer to local array

!**   Interface.
!     ----------
!     CALL TRMTOL_UNPACK(...)

!     Explicit arguments :  PREEL_COMPLEX - local fourier/GP array
!     --------------------  KF_CURRENT - number of fields that are read (from Legendre space)
!                           KF_TOTAL - total fields in PREEL ("stride")
!
!     Externals.  None.
!     ----------

!     Author.
!     -------
!        Mats Hamrud *ECMWF*

!     Modifications.
!     --------------
!        Original : 2000-04-01

!     ------------------------------------------------------------------

USE PARKIND_ECTRANS,        ONLY: JPIM, JPRBT
USE TPM_DISTR,              ONLY: D, MYSETW, D_NSTAGTF, D_NPNTGTB0, D_NPTRLS, D_NDGL_FS
USE TPM_GEOMETRY,           ONLY: G_NMEN, G_NLOEN, G_NLOEN_MAX
USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, ASSIGN_PTR, GET_ALLOCATION
USE ISO_C_BINDING,          ONLY: C_SIZE_T
!

IMPLICIT NONE

REAL(KIND=JPRBT), INTENT(IN) :: FOUBUF(:)
REAL(KIND=JPRBT), INTENT(OUT), POINTER :: PREEL_COMPLEX(:)
INTEGER(KIND=JPIM),INTENT(IN) :: KF_CURRENT, KF_TOTAL
TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR
TYPE(TRMTOL_UNPACK_HANDLE), INTENT(IN) :: HTRMTOL_UNPACK

INTEGER(KIND=JPIM) :: JM,JF,IGLG,ISTA,OFFSET_VAR,IOFF_LAT,KGL
REAL(KIND=JPRBT) :: RET_REAL, RET_COMPLEX

CALL ASSIGN_PTR(PREEL_COMPLEX, GET_ALLOCATION(ALLOCATOR, HTRMTOL_UNPACK%HREEL),&
    & 1_C_SIZE_T, int(KF_TOTAL*D%NLENGTF*SIZEOF(PREEL_COMPLEX(1)),kind=c_size_t))

#ifdef OMPGPU
#endif
#ifdef ACCGPU
!$ACC DATA PRESENT(G_NLOEN,G_NMEN,D_NPNTGTB0,FOUBUF,PREEL_COMPLEX,D_NSTAGTF,D_NDGL_FS) ASYNC(1)
#endif

OFFSET_VAR=D_NPTRLS(MYSETW)
#ifdef OMPGPU
#endif
#ifdef ACCGPU
!$ACC PARALLEL LOOP PRIVATE(IGLG,IOFF_LAT,ISTA,RET_REAL,RET_COMPLEX) FIRSTPRIVATE(KF_CURRENT,&
!$ACC&              KF_TOTAL,OFFSET_VAR,G_NLOEN_MAX) DEFAULT(NONE) &
!$ACC&              ASYNC(1) TILE(32,16,1)
#endif
DO KGL=1,D_NDGL_FS
  DO JF=1,KF_CURRENT
    DO JM=0,G_NLOEN_MAX/2
      IGLG = OFFSET_VAR+KGL-1

      ! FFT transforms NLON real values to floor(NLON/2)+1 complex numbers. Hence we have
      ! to fill those floor(NLON/2)+1 values.
      ! Truncation happens starting at G_NMEN+1. Hence, we zero-fill those values.
      IF (JM <= G_NLOEN(IGLG)/2) THEN
        RET_REAL = 0.0_JPRBT
        RET_COMPLEX = 0.0_JPRBT
        IF (JM <= G_NMEN(IGLG)) THEN
          ISTA  = D_NPNTGTB0(JM,KGL)*KF_CURRENT*2

          RET_REAL    = FOUBUF(ISTA+2*JF-1)
          RET_COMPLEX = FOUBUF(ISTA+2*JF  )
        ENDIF
        IOFF_LAT = KF_TOTAL*D_NSTAGTF(KGL)+(JF-1)*(D_NSTAGTF(KGL+1)-D_NSTAGTF(KGL))
        PREEL_COMPLEX(IOFF_LAT+2*JM+1) = RET_REAL
        PREEL_COMPLEX(IOFF_LAT+2*JM+2) = RET_COMPLEX
      ENDIF
    ENDDO
  ENDDO
ENDDO
#ifdef OMPGPU
#endif
#ifdef ACCGPU
!$ACC END DATA

!$ACC WAIT(1)
#endif

END SUBROUTINE TRMTOL_UNPACK
END MODULE TRMTOL_PACK_UNPACK

